{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "9de58cd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import lightgbm as lgb\n",
    "\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.model_selection import StratifiedKFold,KFold\n",
    "\n",
    "import multiprocessing\n",
    "\n",
    "from tqdm import tqdm\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "36f84a54",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df2 = pd.read_csv('测试集/apply_new.txt',header=None,names=['pid','gender','age','targid','time','province','city','model','make'])\n",
    "df = pd.read_csv('train.txt',header=None,names=['pid','label','gender','age','targid','time','province','city','model','make'])\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a70dd46",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_fea(df):\n",
    "    df['targid_list']=df['targid'].apply(lambda x:x[1:-1].split(\",\"))\n",
    "    for i in range(30):\n",
    "        df['targid'+str(i)]=df['targid_list'].apply(lambda x:x[i] if len(x)>=i+1 else None)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "88e8225f",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "train = get_fea(df)\n",
    "test = get_fea(df2)\n",
    "for col in ['province','city']:\n",
    "    le = LabelEncoder()\n",
    "    test[col] = le.fit_transform(test[col])\n",
    "    train[col] = le.transform(train[col])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "76a876a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(30):\n",
    "    df['targid'+str(i)] = df['targid'+str(i)].astype('float64')\n",
    "    df2['targid'+str(i)] = df2['targid'+str(i)].astype('float64')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "12289c30",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "fea = [f for f in train.columns if f not in ['pid','model','make','targid_list','targid','time','label','len']]# \n",
    "model = lgb.LGBMRegressor(max_depth=15,num_leaves=20,learning_rate=0.1,n_estimators=100,seed=2020)\n",
    "\n",
    "model.fit(train[fea],train['label'])\n",
    "pre = model.predict(test[fea])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "2e2baebd",
   "metadata": {},
   "outputs": [],
   "source": [
    "test['pre'] = pre\n",
    "test['pre'] = test['pre'].apply(lambda x:1 if x>0.5 else 0)\n",
    "sub = test[['pid','pre']]\n",
    "sub = sub.rename(columns=({'pid':'user_id','pre':'category_id'}))\n",
    "sub.to_csv('sub.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "604ec138",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
